{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## word2vec训练中文模型\n",
    "------\n",
    "### 1.准备数据与预处理\n",
    "首先需要一份比较大的中文语料数据，可以考虑中文的维基百科（也可以试试搜狗的新闻语料库）。\n",
    "\n",
    "中文维基百科的打包文件地址为 \n",
    "链接: https://pan.baidu.com/s/1H-wuIve0d_fvczvy3EOKMQ 提取码: uqua \n",
    "\n",
    "百度网盘加速下载地址：https://www.baiduwp.com/?m=index\n",
    "\n",
    "中文维基百科的数据不是太大，xml的压缩文件大约1G左右。首先用处理这个XML压缩文件。\n",
    "\n",
    "**注意输入输出地址**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-05-08 21:42:31,184: INFO: running c:\\users\\mantch\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\ipykernel_launcher.py -f C:\\Users\\mantch\\AppData\\Roaming\\jupyter\\runtime\\kernel-30939db9-3a59-4a92-844c-704c6189dbef.json\n",
      "2019-05-08 21:43:12,274: INFO: Saved 10000 articles\n",
      "2019-05-08 21:43:45,223: INFO: Saved 20000 articles\n",
      "2019-05-08 21:44:14,638: INFO: Saved 30000 articles\n",
      "2019-05-08 21:44:44,601: INFO: Saved 40000 articles\n",
      "2019-05-08 21:45:16,004: INFO: Saved 50000 articles\n",
      "2019-05-08 21:45:47,421: INFO: Saved 60000 articles\n",
      "2019-05-08 21:46:16,722: INFO: Saved 70000 articles\n",
      "2019-05-08 21:46:46,733: INFO: Saved 80000 articles\n",
      "2019-05-08 21:47:16,143: INFO: Saved 90000 articles\n",
      "2019-05-08 21:47:47,533: INFO: Saved 100000 articles\n",
      "2019-05-08 21:48:29,591: INFO: Saved 110000 articles\n",
      "2019-05-08 21:49:04,530: INFO: Saved 120000 articles\n",
      "2019-05-08 21:49:40,279: INFO: Saved 130000 articles\n",
      "2019-05-08 21:50:15,592: INFO: Saved 140000 articles\n",
      "2019-05-08 21:50:54,183: INFO: Saved 150000 articles\n",
      "2019-05-08 21:51:31,123: INFO: Saved 160000 articles\n",
      "2019-05-08 21:52:06,278: INFO: Saved 170000 articles\n",
      "2019-05-08 21:52:43,157: INFO: Saved 180000 articles\n",
      "2019-05-08 21:55:59,809: INFO: Saved 190000 articles\n",
      "2019-05-08 21:57:01,859: INFO: Saved 200000 articles\n",
      "2019-05-08 21:58:33,921: INFO: Saved 210000 articles\n",
      "2019-05-08 21:59:26,744: INFO: Saved 220000 articles\n",
      "2019-05-08 22:00:41,757: INFO: Saved 230000 articles\n",
      "2019-05-08 22:01:36,532: INFO: Saved 240000 articles\n",
      "2019-05-08 22:02:26,347: INFO: Saved 250000 articles\n",
      "2019-05-08 22:03:08,634: INFO: Saved 260000 articles\n",
      "2019-05-08 22:03:53,447: INFO: Saved 270000 articles\n",
      "2019-05-08 22:04:37,136: INFO: Saved 280000 articles\n",
      "2019-05-08 22:05:14,017: INFO: Saved 290000 articles\n",
      "2019-05-08 22:06:01,296: INFO: Saved 300000 articles\n",
      "2019-05-08 22:06:47,762: INFO: Saved 310000 articles\n",
      "2019-05-08 22:07:39,714: INFO: Saved 320000 articles\n",
      "2019-05-08 22:08:28,825: INFO: Saved 330000 articles\n",
      "2019-05-08 22:09:11,412: INFO: finished iterating over Wikipedia corpus of 338005 documents with 77273203 positions (total 3288566 articles, 91445479 positions before pruning articles shorter than 50 words)\n",
      "2019-05-08 22:09:11,555: INFO: Finished Saved 338005 articles\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "import os.path\n",
    "import sys\n",
    "from gensim.corpora import WikiCorpus\n",
    "if __name__ == '__main__':\n",
    "    \n",
    "    # 定义输入输出\n",
    "    basename = \"F:/temp/DL/\"\n",
    "    inp = basename+'zhwiki-latest-pages-articles.xml.bz2'\n",
    "    outp = basename+'wiki.zh.text'\n",
    "    \n",
    "    program = os.path.basename(basename)\n",
    "    logger = logging.getLogger(program)\n",
    "    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
    "    logging.root.setLevel(level=logging.INFO)\n",
    "    logger.info(\"running %s\" % ' '.join(sys.argv))\n",
    "    # check and process input arguments\n",
    "    if len(sys.argv) < 3:\n",
    "        print(globals()['__doc__'] % locals())\n",
    "        sys.exit(1)\n",
    "    \n",
    "    space = \" \"\n",
    "    i = 0\n",
    "    output = open(outp, 'w',encoding='utf-8')\n",
    "    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})\n",
    "    for text in wiki.get_texts():\n",
    "        output.write(space.join(text) + \"\\n\")\n",
    "        i = i + 1\n",
    "        if (i % 10000 == 0):\n",
    "            logger.info(\"Saved \" + str(i) + \" articles\")\n",
    "    output.close()\n",
    "    logger.info(\"Finished Saved \" + str(i) + \" articles\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### 2.训练数据\n",
    "Python的话可用jieba完成分词，生成分词文件wiki.zh.text.seg \n",
    "接着用word2vec工具训练： \n",
    "\n",
    "**注意输入输出地址**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os.path\n",
    "import sys\n",
    "import multiprocessing\n",
    "from gensim.corpora import WikiCorpus\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.models.word2vec import LineSentence\n",
    "\n",
    "# 定义输入输出\n",
    "basename = \"F:/temp/DL/\"\n",
    "inp = basename+'wiki.zh.text'\n",
    "outp1 = basename+'wiki.zh.text.model'\n",
    "outp2 = basename+'wiki.zh.text.vector'\n",
    "\n",
    "program = os.path.basename(basename)\n",
    "logger = logging.getLogger(program)\n",
    "logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
    "logging.root.setLevel(level=logging.INFO)\n",
    "logger.info(\"running %s\" % ' '.join(sys.argv))\n",
    "# check and process input arguments\n",
    "if len(sys.argv) < 4:\n",
    "    print(globals()['__doc__'] % locals())\n",
    "\n",
    "model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,\n",
    "        workers=multiprocessing.cpu_count())\n",
    "# trim unneeded model memory = use(much) less RAM\n",
    "#model.init_sims(replace=True)\n",
    "model.save(outp1)\n",
    "model.save_word2vec_format(outp2, binary=False)"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "输出如下：\n",
    "2019-05-08 22:28:25,638: INFO: running c:\\users\\mantch\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\ipykernel_launcher.py -f C:\\Users\\mantch\\AppData\\Roaming\\jupyter\\runtime\\kernel-b1f915fd-fdb2-43fc-bcf3-b361fb4a7c3d.json\n",
    "2019-05-08 22:28:25,640: INFO: collecting all words and their counts\n",
    "2019-05-08 22:28:25,642: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
    "Automatically created module for IPython interactive environment\n",
    "2019-05-08 22:28:27,887: INFO: PROGRESS: at sentence #10000, processed 4278620 words, keeping 2586311 word types\n",
    "2019-05-08 22:28:29,666: INFO: PROGRESS: at sentence #20000, processed 7491125 words, keeping 4291863 word types\n",
    "2019-05-08 22:28:31,445: INFO: PROGRESS: at sentence #30000, processed 10424455 words, keeping 5704507 word types\n",
    "2019-05-08 22:28:32,854: INFO: PROGRESS: at sentence #40000, processed 13190001 words, keeping 6983862 word types\n",
    "2019-05-08 22:28:34,125: INFO: PROGRESS: at sentence #50000, processed 15813238 words, keeping 8145905 word types\n",
    "2019-05-08 22:28:35,353: INFO: PROGRESS: at sentence #60000, processed 18388731 words, keeping 9198885 word types\n",
    "2019-05-08 22:28:36,544: INFO: PROGRESS: at sentence #70000, processed 20773000 words, keeping 10203788 word types\n",
    "2019-05-08 22:28:37,652: INFO: PROGRESS: at sentence #80000, processed 23064544 words, keeping 11144885 word types\n",
    "2019-05-08 22:28:39,490: INFO: PROGRESS: at sentence #90000, processed 25324650 words, keeping 12034343 word types\n",
    "2019-05-08 22:28:40,688: INFO: PROGRESS: at sentence #100000, processed 27672540 words, keeping 12878856 word types\n",
    "2019-05-08 22:28:41,871: INFO: PROGRESS: at sentence #110000, processed 29985282 words, keeping 13688622 word types\n",
    "2019-05-08 22:28:42,944: INFO: PROGRESS: at sentence #120000, processed 32025045 words, keeping 14477767 word types\n",
    "2019-05-08 22:28:44,048: INFO: PROGRESS: at sentence #130000, processed 34267390 words, keeping 15309447 word types\n",
    "2019-05-08 22:28:45,197: INFO: PROGRESS: at sentence #140000, processed 36451394 words, keeping 16090548 word types\n",
    "2019-05-08 22:28:46,345: INFO: PROGRESS: at sentence #150000, processed 38671717 words, keeping 16877015 word types\n",
    "2019-05-08 22:28:47,483: INFO: PROGRESS: at sentence #160000, processed 40778409 words, keeping 17648492 word types\n",
    "2019-05-08 22:28:48,655: INFO: PROGRESS: at sentence #170000, processed 43154040 words, keeping 18308373 word types\n",
    "2019-05-08 22:28:49,759: INFO: PROGRESS: at sentence #180000, processed 45231681 words, keeping 19010906 word types\n",
    "2019-05-08 22:28:50,826: INFO: PROGRESS: at sentence #190000, processed 47190144 words, keeping 19659373 word types\n",
    "2019-05-08 22:28:51,886: INFO: PROGRESS: at sentence #200000, processed 49201934 words, keeping 20311518 word types\n",
    "2019-05-08 22:28:52,856: INFO: PROGRESS: at sentence #210000, processed 51116197 words, keeping 20917125 word types\n",
    "2019-05-08 22:28:53,859: INFO: PROGRESS: at sentence #220000, processed 53321151 words, keeping 21513016 word types\n",
    "2019-05-08 22:28:54,921: INFO: PROGRESS: at sentence #230000, processed 55408211 words, keeping 22207241 word types\n",
    "2019-05-08 22:28:59,645: INFO: PROGRESS: at sentence #240000, processed 57442276 words, keeping 22849499 word types\n",
    "2019-05-08 22:29:00,988: INFO: PROGRESS: at sentence #250000, processed 59563975 words, keeping 23544817 word types\n",
    "2019-05-08 22:29:02,292: INFO: PROGRESS: at sentence #260000, processed 61764248 words, keeping 24222911 word types\n",
    "2019-05-08 22:29:03,654: INFO: PROGRESS: at sentence #270000, processed 63938511 words, keeping 24906453 word types\n",
    "2019-05-08 22:29:04,900: INFO: PROGRESS: at sentence #280000, processed 66096661 words, keeping 25519781 word types\n",
    "2019-05-08 22:29:06,057: INFO: PROGRESS: at sentence #290000, processed 67947209 words, keeping 26062482 word types\n",
    "2019-05-08 22:29:07,229: INFO: PROGRESS: at sentence #300000, processed 69927780 words, keeping 26649878 word types\n",
    "2019-05-08 22:29:08,506: INFO: PROGRESS: at sentence #310000, processed 71800313 words, keeping 27230264 word types\n",
    "2019-05-08 22:29:09,836: INFO: PROGRESS: at sentence #320000, processed 73942427 words, keeping 27850568 word types\n",
    "2019-05-08 22:29:11,419: INFO: PROGRESS: at sentence #330000, processed 75859220 words, keeping 28467061 word types\n",
    "2019-05-08 22:29:12,379: INFO: collected 28914285 word types from a corpus of 77273203 raw words and 338042 sentences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### 3.测试结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\mantch\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\ipykernel_launcher.py:11: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  # This is added back by InteractiveShellApp.init_path()\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "排球 0.8914323449134827\n",
      "籃球 0.8889479041099548\n",
      "棒球 0.854706883430481\n",
      "高爾夫 0.832783043384552\n",
      "高爾夫球 0.8316080570220947\n",
      "網球 0.8276922702789307\n",
      "橄欖球 0.823620080947876\n",
      "英式足球 0.8229209184646606\n",
      "板球 0.822044312953949\n",
      "欖球 0.8151556253433228\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 测试结果\n",
    "import gensim\n",
    "\n",
    "# 定义输入输出\n",
    "basename = \"F:/temp/DL/\"\n",
    "model = basename+'wiki.zh.text.model'\n",
    "\n",
    "model = gensim.models.Word2Vec.load(model)\n",
    "\n",
    "result = model.most_similar(u\"足球\")\n",
    "for e in result:\n",
    "    print(e[0], e[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "女人 0.908246636390686\n",
      "男孩 0.872255802154541\n",
      "女孩 0.8567496538162231\n",
      "孩子 0.8363182544708252\n",
      "知道 0.8341636061668396\n",
      "某人 0.8211491107940674\n",
      "漂亮 0.8023637533187866\n",
      "伴侶 0.8001378774642944\n",
      "什麼 0.7944830656051636\n",
      "嫉妒 0.7929206490516663\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\mantch\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "result = model.most_similar(u\"男人\")\n",
    "for e in result:\n",
    "    print(e[0], e[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
